####################################################################################
## Chromosome	Start_position	Tumor_Sample_Barcode	Reference_Allele	Tumor_Seq_Allele2	Hugo_Symbol	Variant_Classification	Variant_Type
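## Target layout: every *_use.maf table written below uses the tab-separated columns listed above, followed by a trailing From column naming the source cohort.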

####################################################################################
## TCGA
# Build ${maf_public_path}/TCGA_use.maf in the shared 9-column layout from
# ${work_dir}/public_ref/TCGA/TCGA_info_include_mut.tsv, then write one subset
# per subtype (TCGA_use.DGC.maf / TCGA_use.IGC.maf).
# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/TCGA_use.maf"

# Drop every input line containing "Chromosome" (the header line), reorder the
# source columns into the shared layout and tag each row with From=TCGA.
grep -v Chromosome "${work_dir}/public_ref/TCGA/TCGA_info_include_mut.tsv" \
  | awk -F'\t' -v OFS='\t' -v From="TCGA" '{print $6,$7,$1,$12,$14,$2,$10,$11,From}' \
  >> "${maf_public_path}/TCGA_use.maf"

# "|"-joined first-column values of the TMB rows that mention each subtype.
# Blank first columns are skipped so the list never holds an empty alternative.
dgc_sample=$(grep Diffuse "${work_dir}/public_ref/TCGA/TCGA_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
igc_sample=$(grep Intestinal "${work_dir}/public_ref/TCGA/TCGA_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

# Keep the header plus every line that contains one of the IDs as a whole word.
# When a list is empty the pattern collapses to "Chromosome" (header only):
# an ERE starting with "|" has undefined results under POSIX and must not be
# handed to grep.
grep -w -E -e "${dgc_sample:+${dgc_sample}|}Chromosome" "${maf_public_path}/TCGA_use.maf" \
  > "${maf_public_path}/TCGA_use.DGC.maf"
grep -w -E -e "${igc_sample:+${igc_sample}|}Chromosome" "${maf_public_path}/TCGA_use.maf" \
  > "${maf_public_path}/TCGA_use.IGC.maf"

####################################################################################
## OncoSG
# Build ${maf_public_path}/OncoSG_use.maf in the shared 9-column layout from
# ${work_dir}/public_ref/OncoSG/OncoSG_info_include_mut.tsv, then write one
# subset per subtype (OncoSG_use.DGC.maf / OncoSG_use.IGC.maf).
# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/OncoSG_use.maf"

# Drop every input line containing "Chromosome" (the header line), reorder the
# source columns into the shared layout and tag each row with From=OncoSG.
grep -v Chromosome "${work_dir}/public_ref/OncoSG/OncoSG_info_include_mut.tsv" \
  | awk -F'\t' -v OFS='\t' -v From="OncoSG" '{print $5,$6,$17,$12,$14,$1,$10,$11,From}' \
  >> "${maf_public_path}/OncoSG_use.maf"

# "|"-joined first-column values of the TMB rows that mention each subtype.
# Blank first columns are skipped so the list never holds an empty alternative.
dgc_sample=$(grep Diffuse "${work_dir}/public_ref/OncoSG/OncoSG_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
igc_sample=$(grep Intestinal "${work_dir}/public_ref/OncoSG/OncoSG_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

# Keep the header plus every line that contains one of the IDs as a whole word.
# When a list is empty the pattern collapses to "Chromosome" (header only):
# an ERE starting with "|" has undefined results under POSIX and must not be
# handed to grep.
grep -w -E -e "${dgc_sample:+${dgc_sample}|}Chromosome" "${maf_public_path}/OncoSG_use.maf" \
  > "${maf_public_path}/OncoSG_use.DGC.maf"
grep -w -E -e "${igc_sample:+${igc_sample}|}Chromosome" "${maf_public_path}/OncoSG_use.maf" \
  > "${maf_public_path}/OncoSG_use.IGC.maf"

####################################################################################
## utokyo
# Build ${maf_public_path}/utokyo_use.maf in the shared 9-column layout from
# ${work_dir}/public_ref/utokyo/utokyo_STAD.maf, then write the DGC subset
# (this section produces no IGC subset).
# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/utokyo_use.maf"

# Drop every input line containing "Chromosome" (the header line), reorder the
# source columns into the shared layout and tag each row with From=Utokyo.
grep -v Chromosome "${work_dir}/public_ref/utokyo/utokyo_STAD.maf" \
  | awk -F'\t' -v OFS='\t' -v From="Utokyo" '{print $5,$6,$17,$12,$14,$1,$10,$11,From}' \
  >> "${maf_public_path}/utokyo_use.maf"

# "|"-joined first-column values of the TMB rows that mention "Diffuse".
# Blank first columns are skipped so the list never holds an empty alternative.
dgc_sample=$(grep Diffuse "${work_dir}/public_ref/utokyo/utokyo_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

# Keep the header plus every line that contains one of the IDs as a whole word.
# When the list is empty the pattern collapses to "Chromosome" (header only):
# an ERE starting with "|" has undefined results under POSIX.
grep -w -E -e "${dgc_sample:+${dgc_sample}|}Chromosome" "${maf_public_path}/utokyo_use.maf" \
  > "${maf_public_path}/utokyo_use.DGC.maf"

####################################################################################
## TMUCIH
# Build ${maf_public_path}/TMUCIH_use.maf in the shared 9-column layout from
# ${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.maf, then write one subset per
# subtype (TMUCIH_use.DGC.maf / TMUCIH_use.IGC.maf).
# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/TMUCIH_use.maf"

# Drop every input line containing "Chromosome" (the header line), reorder the
# source columns into the shared layout and tag each row with From=TMUCIH.
grep -v Chromosome "${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.maf" \
  | awk -F'\t' -v OFS='\t' -v From="TMUCIH" '{print $5,$6,$17,$12,$14,$1,$10,$11,From}' \
  >> "${maf_public_path}/TMUCIH_use.maf"

# This cohort's TMB table is selected by the standalone words "D" and "I"
# anywhere on the line. "|"-joined first-column values of the matching rows;
# blank first columns are skipped so the list never holds an empty alternative.
dgc_sample=$(grep -w D "${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
igc_sample=$(grep -w I "${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

# Keep the header plus every line that contains one of the IDs as a whole word.
# When a list is empty the pattern collapses to "Chromosome" (header only):
# an ERE starting with "|" has undefined results under POSIX.
grep -w -E -e "${dgc_sample:+${dgc_sample}|}Chromosome" "${maf_public_path}/TMUCIH_use.maf" \
  > "${maf_public_path}/TMUCIH_use.DGC.maf"
grep -w -E -e "${igc_sample:+${igc_sample}|}Chromosome" "${maf_public_path}/TMUCIH_use.maf" \
  > "${maf_public_path}/TMUCIH_use.IGC.maf"

####################################################################################
## NMU
# Select the in-house samples from ${config_path}/tumor_normal.class.list and
# build NMU_use.maf (+ DGC / IGC subsets) and NMU_use.IM.maf from the tables
# under ${maf_path}. Every emitted row is tagged with From=NJMU.

# Class-list rows after dropping the first line and every row that contains
# "IM + IGC + DGC"; computed once and reused for the three ID lists below.
nmu_rows=$(sed '1d' "${config_path}/tumor_normal.class.list" | grep -v "IM + IGC + DGC")

# "|"-joined, de-duplicated first-column values: all kept rows, the rows that
# mention DGC, and the rows that mention IGC. Blank first columns are skipped
# so a list never holds an empty alternative.
use_sample=$(printf '%s\n' "${nmu_rows}" \
  | awk -F'\t' '$1 != "" {print $1}' | sort -u | tr '\n' '|' | sed 's/|$//')
dgc_sample=$(printf '%s\n' "${nmu_rows}" | grep DGC \
  | awk -F'\t' '$1 != "" {print $1}' | sort -u | tr '\n' '|' | sed 's/|$//')
igc_sample=$(printf '%s\n' "${nmu_rows}" | grep IGC \
  | awk -F'\t' '$1 != "" {print $1}' | sort -u | tr '\n' '|' | sed 's/|$//')

# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/NMU_use.maf"
# Append From to every cancer row, then keep the rows that contain a selected
# ID as a whole word. With no selected ID nothing is appended (an empty
# pattern must not be handed to grep).
if [ -n "${use_sample}" ]; then
  awk -F'\t' -v OFS='\t' -v From="NJMU" '{print $0,From}' \
    "${maf_path}/All_ForMutSig.extract.cancer.maf" \
    | grep -E -w -e "${use_sample}"
fi >> "${maf_public_path}/NMU_use.maf"

# Header plus the rows of each subtype; an empty list keeps the header only
# (an ERE starting with "|" has undefined results under POSIX).
grep -w -E -e "${dgc_sample:+${dgc_sample}|}Chromosome" "${maf_public_path}/NMU_use.maf" \
  > "${maf_public_path}/NMU_use.DGC.maf"
grep -w -E -e "${igc_sample:+${igc_sample}|}Chromosome" "${maf_public_path}/NMU_use.maf" \
  > "${maf_public_path}/NMU_use.IM.maf.tmp.$$" 2>/dev/null
mv -f "${maf_public_path}/NMU_use.IM.maf.tmp.$$" "${maf_public_path}/NMU_use.IGC.maf" 2>/dev/null \
  || grep -w -E -e "${igc_sample:+${igc_sample}|}Chromosome" "${maf_public_path}/NMU_use.maf" \
    > "${maf_public_path}/NMU_use.IGC.maf"

## IM: uses all the MSS samples
## (original note lists: the MSS ones, the MSI ones, the IGC + DGC ones)
# Precancer rows that match a selected ID, tagged with From=NJMU; the output
# has no header line. The pattern is quoted so it reaches grep as one argument
# (unquoted it was subject to word splitting and globbing).
# NOTE(review): unlike the cancer filter above, this match is not restricted
# to whole words (no -w) - confirm that is intended.
if [ -n "${use_sample}" ]; then
  grep -E -e "${use_sample}" "${maf_path}/All_ForMutSig.extract.precancer.maf"
fi | awk -F'\t' -v OFS='\t' -v From="NJMU" '{print $0,From}' \
  > "${maf_public_path}/NMU_use.IM.maf"
#cat ${maf_path}/All_ForMutSig.extract.precancer.MSI.maf | sed '1d' | awk -F'\t' '{OFS="\t"}{print $0,From}' From="NJMU" \
#>> ${maf_public_path}/NMU_use.IM.maf

####################################################################################
## Merge
# Concatenate the five cohort tables into All_raw.maf, apply three filters
# (QC1: indel size + sex chromosomes, QC2: simple repeats, QC3: segmental
# duplications), write the result as All_use.maf and record the row count
# after each step in Vcf_QC.list.
# printf is used for headers because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/All_raw.maf"

# Each cohort table carries its own header; strip them (every line containing
# "Chromosome") so that All_raw.maf keeps only the header written above.
cat "${maf_public_path}/NMU_use.maf" "${maf_public_path}/OncoSG_use.maf" \
  "${maf_public_path}/TCGA_use.maf" "${maf_public_path}/utokyo_use.maf" \
  "${maf_public_path}/TMUCIH_use.maf" \
  | grep -v Chromosome >> "${maf_public_path}/All_raw.maf"
raw_mutNum=$(grep -v Chromosome "${maf_public_path}/All_raw.maf" | wc -l | tr -d ' ')

## QC1: remove sex-chromosome mutations and variants whose reference/alternate
## allele lengths differ by more than 50
# One awk pass: skip the header, skip rows failing the length limit, skip rows
# whose first column is exactly "X" or "Y", then duplicate column 2 so the
# first three columns read chromosome, start, end for bedtools.
# NOTE(review): start and end are the same value here - confirm this is the
# interval convention bedtools should see.
awk -F'\t' -v OFS='\t' -v length_limit=50 '
  /Chromosome/ { next }
  {
    d = length($4) - length($5)
    if (d < 0) d = -d
  }
  d > length_limit { next }
  $1 == "X" || $1 == "Y" { next }
  { print $1, $2, $2, $3, $4, $5, $6, $7, $8, $9 }
' "${maf_public_path}/All_raw.maf" > "${maf_public_path}/All_raw.QC1.maf"
QC1_mutNum=$(grep -v Chromosome "${maf_public_path}/All_raw.QC1.maf" | wc -l | tr -d ' ')

## QC2: drop rows overlapping the simple-repeat regions
# ${bedtools} is left unquoted on purpose so it may hold a command plus
# arguments; ~ is left unquoted so the shell expands it.
${bedtools} intersect -a "${maf_public_path}/All_raw.QC1.maf" -b ~/ref/SimpleRepeats/GRCh37_SimpleRepeats.bed -v -header \
  > "${maf_public_path}/All_raw.QC2.maf"
QC2_mutNum=$(grep -v Chromosome "${maf_public_path}/All_raw.QC2.maf" | wc -l | tr -d ' ')

## QC3: drop rows overlapping the segmental-duplication regions
${bedtools} intersect -a "${maf_public_path}/All_raw.QC2.maf" -b ~/ref/SimpleRepeats/GRCh37_segdup.nochr.bed -v -header \
  > "${maf_public_path}/All_raw.QC3.maf"
QC3_mutNum=$(grep -v Chromosome "${maf_public_path}/All_raw.QC3.maf" | wc -l | tr -d ' ')

## Final table
# Remove the duplicated end column added for bedtools and restore the header.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/All_use.maf"
awk -F'\t' -v OFS='\t' '{print $1,$2,$4,$5,$6,$7,$8,$9,$10}' "${maf_public_path}/All_raw.QC3.maf" \
  | grep -v Chromosome >> "${maf_public_path}/All_use.maf"

# Row counts after each step, as a two-line comma-separated report.
printf 'RawMutNum,Sex_Length,SimpleRepeats_QC_num,SegDup_QC_num\n' > "${maf_public_path}/Vcf_QC.list"
printf '%s,%s,%s,%s\n' "${raw_mutNum}" "${QC1_mutNum}" "${QC2_mutNum}" "${QC3_mutNum}" \
  >> "${maf_public_path}/Vcf_QC.list"

####################################################################################

# Collect the IGC and DGC sample IDs of every cohort into the "|"-separated
# lists igc_sample and dgc_sample. Each per-cohort list holds the first-column
# values of the rows matching that cohort's subtype keyword; blank first
# columns are skipped.
igc_sample1=$(grep Intestinal "${work_dir}/public_ref/TCGA/TCGA_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
igc_sample2=$(grep Intestinal "${work_dir}/public_ref/OncoSG/OncoSG_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
igc_sample3=$(sed '1d' "${config_path}/tumor_normal.class.list" | grep -v "IM + IGC + DGC" | grep IGC \
  | awk -F'\t' '$1 != "" {print $1}' | sort -u | tr '\n' '|' | sed 's/|$//')
igc_sample4=$(grep -w I "${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

#igc_sample4=`cat ${work_dir}/public_ref/HK/HK_STAD.TMB.tsv | grep -v Patient | grep intestinal | awk -F'\t' '{print $1}' | tr '\n' '|' | sed 's/|$//'`
#igc_sample="${igc_sample1}|${igc_sample2}|${igc_sample3}|${igc_sample4}"
# Join only the non-empty per-cohort lists, so a cohort without IGC samples
# does not leave an empty entry ("a||b" or a trailing "|") in the result.
igc_sample=""
for _part in "${igc_sample1}" "${igc_sample2}" "${igc_sample3}" "${igc_sample4}"; do
  if [ -n "${_part}" ]; then
    igc_sample="${igc_sample:+${igc_sample}|}${_part}"
  fi
done

dgc_sample1=$(grep Diffuse "${work_dir}/public_ref/TCGA/TCGA_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
dgc_sample2=$(grep Diffuse "${work_dir}/public_ref/OncoSG/OncoSG_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
dgc_sample3=$(sed '1d' "${config_path}/tumor_normal.class.list" | grep -v "IM + IGC + DGC" | grep DGC \
  | awk -F'\t' '$1 != "" {print $1}' | sort -u | tr '\n' '|' | sed 's/|$//')
dgc_sample4=$(grep Diffuse "${work_dir}/public_ref/utokyo/utokyo_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')
dgc_sample5=$(grep -w D "${work_dir}/public_ref/TMUCIH/TMUCIH_STAD.TMB.tsv" \
  | awk -F'\t' '$1 != "" {print $1}' | tr '\n' '|' | sed 's/|$//')

#dgc_sample4=`cat ${work_dir}/public_ref/HK/HK_STAD.TMB.tsv | grep -v Patient | grep diffuse | awk -F'\t' '{print $1}' | tr '\n' '|' | sed 's/|$//'`
#dgc_sample="${dgc_sample1}|${dgc_sample2}|${dgc_sample3}|${dgc_sample4}"
# Same non-empty join for the five DGC lists.
dgc_sample=""
for _part in "${dgc_sample1}" "${dgc_sample2}" "${dgc_sample3}" "${dgc_sample4}" "${dgc_sample5}"; do
  if [ -n "${_part}" ]; then
    dgc_sample="${dgc_sample:+${dgc_sample}|}${_part}"
  fi
done

## Rows of the IGC / DGC samples
#######################################
# Append the rows of the listed samples to an output table.
# Arguments:
#   $1 - sample IDs separated by "|" (whitespace also separates, empty
#        entries and repeated IDs are ignored)
#   $2 - tab-separated input table; column 3 is compared with the IDs
#   $3 - output file, appended to
# Outputs:
#   Rows are grouped by sample in the order the IDs first appear in $1 and
#   keep their input order within a sample. The input is read once (instead
#   of once per sample) and a sample listed twice is emitted only once.
#   IDs are compared as strings. Matching rows are held in memory until the
#   end of the input.
#######################################
_append_sample_rows() {
  awk -F'\t' -v ids="$1" '
    BEGIN {
      n = split(ids, id, "[| \t\n]+")
      for (i = 1; i <= n; i++) {
        if (id[i] != "" && !(id[i] in rank)) rank[id[i]] = ++m
      }
    }
    ($3 in rank) { k = rank[$3]; row[k, ++cnt[k]] = $0 }
    END {
      for (i = 1; i <= m; i++)
        for (j = 1; j <= cnt[i]; j++) print row[i, j]
    }
  ' "$2" >> "$3"
}

## IGC samples
# Start each output with the header line(s) of All_use.maf (lines containing
# the word "Chromosome"), then append the rows of the listed samples.
grep -w -E "Chromosome" "${maf_public_path}/All_use.maf" > "${maf_public_path}/All_use.IGC.maf"
_append_sample_rows "${igc_sample}" "${maf_public_path}/All_use.maf" "${maf_public_path}/All_use.IGC.maf"

## DGC samples
grep -w -E "Chromosome" "${maf_public_path}/All_use.maf" > "${maf_public_path}/All_use.DGC.maf"
_append_sample_rows "${dgc_sample}" "${maf_public_path}/All_use.maf" "${maf_public_path}/All_use.DGC.maf"

## IM samples
# Write All_use.IM.maf: the shared header followed by every line of
# NMU_use.IM.maf that does not contain "Chromosome".
# printf is used for the header because `echo -e` is not portable across shells.
printf 'Chromosome\tStart_position\tTumor_Sample_Barcode\tReference_Allele\tTumor_Seq_Allele2\tHugo_Symbol\tVariant_Classification\tVariant_Type\tFrom\n' \
  > "${maf_public_path}/All_use.IM.maf"
grep -v Chromosome "${maf_public_path}/NMU_use.IM.maf" >> "${maf_public_path}/All_use.IM.maf"